Summary
Coronavirus disease 2019 is a new coronavirus that can be spread from person to person. This virus was first detected in Wuhan, China and was first reported to WHO (World Health Organization) on 31 December, 2019. A global pandemic was declared on 11 March, 2020.
The dataset used in this project can be found on a publicly available data repository which is created and maintained by the Johns Hopkins University Center for Systems Science and Engineering. At the time of this project, the dataset is current as of 11 July, 2020. This data is being updated daily, code can be reproduced to reflect the most current data.
Preparations
Packages
Load pre-defined R packages: “tidyverse”, “scales”, “reactable”, “htmltools”, “gganimate”, “ggthemes”, and “coronavirus”.
library(tidyverse)
library(scales)
# data frames in coronavirus package are in tibble format
library(coronavirus)
library(reactable)
library(htmltools)
library(gganimate)
library(ggthemes)
Main Data
The following interactive table provides a general information on our main dataset, which is current as of 11 July, 2020. Viewers can browse more observations by utilizing the pagination feature and/or the search box at the upper right of the table.
# To view the coronavirus dataset
reactable::reactable(coronavirus, minRows = 5, searchable = TRUE)
At a Glance
Notice the dataset consists of 3 different types: “confirmed”, “death”, and “recovered”. We can take a look at how the number of cases in each 3 types changes over time.
#the different types in main dataset
unique(coronavirus$type)
## [1] "confirmed" "death" "recovered"
# data frame of interest
total_cases_vs_date <- coronavirus %>%
group_by(date, type) %>%
summarize(total_cases = sum(cases))
#plotting total_cases vs Date for these 3 types
ggplot(total_cases_vs_date, aes(date,total_cases)) +
geom_line() +
facet_wrap(~type, ncol=1,scales="free_y") +
my_theme +
ylab("Total cases")

Confirmed Cases
Worldwide confirmed cases
The following graph sugguests that sometime after March, the total worldwide confirmed cases rises almost linearly over time and it does not show any sign of slowing down. We will examine whether the increasing behavior is linear in next sections.
# extract the data of interest from the main coronavirus dataset
confirmed_cases_worldwide <- coronavirus %>%
filter(type == "confirmed") %>%
group_by(date) %>%
summarize(cases = sum(cases)) %>%
mutate(cum_cases = cumsum(cases)) %>%
select(date, cum_cases)
# to visualize how quick the covid spread in the world
ggplot(data = confirmed_cases_worldwide, aes(date, cum_cases)) +
geom_line() +
my_theme

China vs. the World
The graph implies that China seems to be able to slow down the coronavirus as comparing to the rest of the world. The cumulative confirmed cases of non-China countries seems to increases at a steadily rate.
Notice there was a jump in China’s confirmed cases line on 13 Frebruary, 2020. This is actually due to the change in their reporting figures on that day, that instead of lab tests, CT scans were accepted as evidence of the COVID-19.
# extract dataset of interest
confirmed_cases_china_vs_world <- coronavirus %>%
filter(type == "confirmed") %>%
mutate(is_china = case_when(
country == "China" ~ "China",
country != "China" ~ "Not China"
)) %>%
select(is_china, date, cases) %>%
group_by(is_china, date) %>%
summarize(cases = sum(cases)) %>%
mutate(cum_cases = cumsum(cases))
# total number of confirmed cases
frmLast <- confirmed_cases_china_vs_world %>%
slice(which.max(date))
plt_cum_confirmed_cases_china_vs_world <- ggplot(data = confirmed_cases_china_vs_world) +
geom_line(aes(date,cum_cases, group = is_china, color = is_china)) +
geom_point(data = frmLast, aes(date,cum_cases), col = "orange", shape = 21, fill = "white", size = 2, stroke = 1.7, show.legend=FALSE) +
geom_text(data = frmLast, aes(date, cum_cases, label = cum_cases, size = 0.5, vjust = 1.5, hjust = 0.8),show.legend = FALSE) +
my_theme
# see the plot with events
plt_cum_confirmed_cases_china_vs_world +
geom_vline(data = who_events, aes(xintercept = date), linetype = "dashed") +
geom_text(data = who_events, aes(x = date, label = event), y = 5e6, size = 3)

China’s confirmed cases in detail
In this section, we want to see how the trend lines change before and after the COVID-19 being declared as a pandemic. To include China’s reporting figure change, we will look at the confirmed cases in two time frames: between 2020-02-15 and 2020-03-11, and after 2020-03-11.
china_after_feb15 <- confirmed_cases_china_vs_world %>%
filter(is_china == "China" & date >= "2020-02-15")
china_feb15_mar11 <- china_after_feb15 %>%
filter(date <= "2020-03-11")
china_after_mar11 <- china_after_feb15 %>%
filter(date >= "2020-03-11")
ggplot(china_after_feb15, aes(x = date, y = cum_cases)) +
geom_line() +
geom_smooth(data = china_feb15_mar11,method = "lm", se = FALSE, size = 0.7, aes(color = "blue")) +
geom_smooth(data = china_after_mar11,method = "lm", se = FALSE, size = 0.7, aes(color = "red")) +
my_theme +
scale_color_identity(name = "Trend Line",
labels = c("Between Feb15 and Mar11", "After Mar11"),
guide = "legend")

How about the rest of the world?
The following graph shows how the two trend lines of non-China countires differ before and after 11 March, 2020.
not_china <- confirmed_cases_china_vs_world %>%
filter(is_china != "China")
not_china_feb15_mar11 <- not_china %>%
filter(date >= "2020-02-15" & date <= "2020-03-11")
not_china_after_mar11 <- not_china %>%
filter(date >= "2020-03-11")
ggplot(data = not_china, aes(x = date, y = cum_cases)) +
geom_line() +
geom_smooth(data = not_china_feb15_mar11, method = "lm", se=FALSE, size = 0.7, aes(color = "blue")) +
my_theme +
geom_smooth(data = not_china_after_mar11, method = "lm", se = FALSE, size = 0.7, aes(color = "red")) +
scale_color_identity(name = "Trend Line",
labels = c("Between Feb15 and Mar11", "After Mar11"),
guide = "legend")

When we plot worldwide confirmed cases, the graph seems to suggest that the increasing behavior is very close to linear. In fact, the following graph, specifically on non-China countires, shows that the spread is actually increasing at an exponential rate.
plt_not_china_trend_line_all <- ggplot(data = not_china, aes(x = date, y = cum_cases)) +
geom_line() +
geom_smooth( method = "lm", se=FALSE, size = 0.7) +
my_theme +
scale_y_log10(labels = comma_format())
plt_not_china_trend_line_all

Top 10 hardest hit countries
The table below shows the top 10 countires that are affected the most due to COVID-19.
# extract data frame of interest
confirmed_cases_by_country <- coronavirus %>%
filter(type == "confirmed") %>%
group_by(date, country, province) %>%
summarize(cases = sum(cases)) %>%
select(country, province, date, cases) %>%
group_by(country) %>%
mutate(cum_cases = cumsum(cases))
# extract top 10 countries with hardest hit
confirmed_cases_top10 <- confirmed_cases_by_country %>%
group_by(country) %>%
summarize(total_cases = max(cum_cases)) %>%
top_n(10) %>%
arrange(desc(total_cases))
# view them in a table
confirmed_cases_top10
## # A tibble: 10 x 2
## country total_cases
## <chr> <int>
## 1 US 3245925
## 2 Brazil 1839850
## 3 India 849522
## 4 Russia 719449
## 5 Peru 322710
## 6 Chile 312029
## 7 Mexico 295268
## 8 United Kingdom 290504
## 9 South Africa 264184
## 10 Iran 255117
Plotting Top 5 hardest hit countries
Visualizing top 5 countires that are affected the most and their trajectories.
# confirmed cases top 5
confirmed_cases_top5_country <- confirmed_cases_by_country %>%
filter(country %in% confirmed_cases_top10$country[1:5]) %>%
select(country, date, cum_cases)
ggplot(data = confirmed_cases_top5_country, aes(x = date, y = cum_cases, color = country, group = country)) +
geom_line() +
my_theme

Death Cases
Top 25 Death count by country
The following graph gives the top 25 countries that have the most number of death due to COVID-19 and their death counts.
top_25_death_count <- coronavirus %>%
filter(type == "death") %>%
group_by(country) %>%
summarize(total_death = sum(cases)) %>%
arrange(desc(total_death)) %>%
top_n(25)
top_25_death_count$country = factor(top_25_death_count$country, levels = top_25_death_count$country)
top_25_death_count$angle = 1:25 * 360/25
plt_top_25_death_count <- ggplot(top_25_death_count, aes(country, total_death, fill = total_death)) +
geom_col(width = 1, color = 'grey90') +
geom_col(aes(y=I(5)), width=1, fill='grey90', alpha = 0.2) +
geom_col(aes(y=I(3)), width=1, fill='grey90', alpha = 0.2) +
geom_col(aes(y=I(2)), width=1, fill = "white") +
scale_y_log10() +
scale_fill_gradientn(colors = c("darkgreen","green","orange","firebrick","red"),trans ="log") +
geom_text(aes(label = paste(country, total_death, sep= "\n"),
y = total_death * 0.5, angle= angle),
data = function(top_25_death_count) top_25_death_count[top_25_death_count$total_death > 8000,],
size = 2.5, color = "white", fontface = "bold", vjust =1) +
geom_text(aes(label = paste0(total_death, " death ", country),
y =max(total_death)*1.5, angle = angle+90),
data = function(top_25_death_count) top_25_death_count[top_25_death_count$total_death < 8000,],
size = 2.5, vjust = 0.5) +
coord_polar(direction=-1) +
theme_void() +
theme(legend.position="none")
plt_top_25_death_count

Overall changes over time
United States rank the top 1 country with most death count. The following plots shows how death count changes from 30 March, 2020 to the current date for the top 25 countries.
death_count<- coronavirus %>%
filter(type == "death") %>%
group_by(country,date) %>%
summarize(cases = sum(cases)) %>%
mutate(total_death = cumsum(cases))
death_count_data <- death_count %>%
filter(country %in% top_25_death_count$country)
df.death_count_data <- death_count_data %>%
filter(date >= "2020-03-30") %>%
group_by(date) %>%
mutate(ordering = rank(total_death)) %>%
ungroup()
plt_death_count_data <-ggplot(df.death_count_data, aes(ordering,group = country, fill = country)) +
geom_tile(aes(y = total_death/2,
height = total_death,
width = 0.8), alpha = 0.9, size =0.8) +
geom_text(aes(y = total_death, label = paste(total_death)), hjust =-.4,size = 3)+
geom_text(aes( y = 0, label = paste(country)), hjust = 1, size = 3) +
coord_flip(clip = "off", expand = FALSE) +
scale_color_viridis_d(name = "")+
scale_fill_viridis_d(name="")+
scale_y_continuous() +
theme_tufte(10, "Avenir") +
guides(color=F, fill=F) +
theme(plot.title = element_text(hjust = 0, size = 15),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.title.y = element_blank(),
axis.ticks = element_blank(),
plot.margin = margin(2,2,2,4,"cm")) +
labs(title = "Date: {frame_time}", subtitle = "Top 25 Countries", y = "Death count") +
transition_time(date) +
ease_aes('cubic-in-out')
animate(plt_death_count_data, fps=10, duration = 15, end_pause = 30, rewind = FALSE)

Recovered Cases
Percent of daily recovered cases
Here we are interested in how the percentage of daily recovered cases change over time, we calculate this percentage as:
\(recover\;percent_{i}=\frac{total\; recoverd\;cases_{i}}{total_\;confirmed\;cases_{i}}\:\:\:i\in dataset\;date\;range \:\:for\;each\;country\)
The following plot shows how the percentage change over time for the top 5 countries that were hitted the hardest. Notice the start tracking date is different for each country as it is in respective to their date where the first confirmed cases were found.
#cumulative confirmed cases by date for each country
confirmed_country_date <- coronavirus %>%
filter(type == "confirmed") %>%
group_by(country, date) %>%
summarize(cases = sum(cases)) %>%
mutate(total_confirmed = cumsum(cases)) %>%
select(country, date, total_confirmed)
confirmed_country_date_top5 <- confirmed_country_date %>%
filter(country %in% confirmed_cases_top10$country[1:5])
#cumulative recovered cases by date for each country
recovered_country_date <- coronavirus %>%
filter(type == "recovered") %>%
group_by(country, date) %>%
summarize(cases = sum(cases)) %>%
mutate(total_recovered = cumsum(cases)) %>%
select(country, date, total_recovered)
recovered_country_date_top5 <- recovered_country_date %>%
filter(country %in% confirmed_cases_top10$country[1:5])
#final data frame of interest
recover_percent_tbl <- inner_join(confirmed_country_date_top5, recovered_country_date_top5, by = c("date", "country")) %>%
mutate(recover_percent = total_recovered/total_confirmed) %>%
filter(total_confirmed !=0)
#plot
plt_recover_percent <- ggplot(recover_percent_tbl, aes(x = date, y = recover_percent, group = country, color = country)) +
geom_line() +
scale_y_continuous(labels=scales::percent) +
scale_color_viridis_d() +
theme_classic() +
labs(x = "Dates", y = "Daily Recovered Percentage") +
theme(legend.position = "top") +
geom_point() +
transition_reveal(date)
animate(plt_recover_percent, rewind = FALSE, end_pause= 30)

Case Counts and Percantages
#cumulative confirmed cases by date for each country
confirmed_country <- coronavirus %>%
filter(type == "confirmed") %>%
group_by(country) %>%
summarize(cases = sum(cases)) %>%
mutate(ttl_confirmed= cases) %>%
select(country, ttl_confirmed)
recovered_country <- coronavirus %>%
filter(type == "recovered") %>%
group_by(country) %>%
summarize(cases = sum(cases)) %>%
mutate(ttl_recovered= cases) %>%
select(country, ttl_recovered)
death_country <- coronavirus %>%
filter(type == "death") %>%
group_by(country) %>%
summarize(cases = sum(cases)) %>%
mutate(ttl_death= cases) %>%
select(country, ttl_death)
final_tbl <- confirmed_country %>%
inner_join(recovered_country, by = "country") %>%
mutate(recovered_pct = as.numeric(format(round(ttl_recovered/ttl_confirmed,4), nsmall = 4))) %>%
inner_join(death_country, by = "country") %>%
mutate(death_pct = as.numeric(format(round(ttl_death/ttl_confirmed,4), nsmall = 4))) %>%
filter(ttl_confirmed !=0) %>%
select(country, ttl_confirmed, ttl_recovered, ttl_death, recovered_pct, death_pct) %>%
arrange(desc(ttl_confirmed))
Chart for summary table
The following gives an interactive table, which has the percentage of the recovered cases and the percentage of death cases for each country.
bar_chart <- function(label, value, height = "20px", fill = "#00bfc4", background = NULL){
width <- paste0(value * 100, "%")
bar <- div(style = list(background = fill, width = width, height = height))
chart <- div(style = list(flexGrow = 1, align= "right",background = background), bar)
div(style=list(display = "flex", align = "right"),chart, label)
}
reactable::reactable(final_tbl, minRows = 5, searchable = TRUE,
columns = list(
ttl_confirmed = colDef(align = "center",format = colFormat(separators = TRUE)),
ttl_recovered = colDef(align = "center",format = colFormat(separators = TRUE)),
ttl_death = colDef(align = "center",format = colFormat(separators = TRUE)),
recovered_pct = colDef(align = "center", cell = function(value){
label <- paste0(round(value*100,digits = 2),"% ")
bar_chart(label, value, background = "#e1e1e1")
},
),
death_pct = colDef(align="center", cell = function(value){
label <- paste0(round(value*100, digits = 2),"% ")
bar_chart(label, value, fill = "#fc5185", background = "#e1e1e1")
}
)
)
)